# Install required libraries
packages <- c('leaflet','dplyr','data.table','sp', 'rgeos', 'raster', 
'rgdal','GISTools','magrittr','BSDA', 'PASWR','broom','tidyverse','gtools')

for(p in packages){
  if(!require(p,character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}
Loading required package: leaflet
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Loading required package: dplyr
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: data.table
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
data.table 1.12.8 using 4 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

Loading required package: sp
Loading required package: rgeos
rgeos version: 0.5-2, (SVN revision 621)
 GEOS runtime version: 3.7.2-CAPI-1.11.2 
 Linking to sp version: 1.3-1 
 Polygon checking: TRUE 

Loading required package: raster

Attaching package: ‘raster’

The following object is masked from ‘package:data.table’:

    shift

The following object is masked from ‘package:dplyr’:

    select

Loading required package: rgdal
rgdal: version: 1.4-8, (SVN revision 845)
 Geospatial Data Abstraction Library extensions to R successfully loaded
 Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
 Path to GDAL shared files: /Users/jayneteo/Library/R/3.6/library/rgdal/gdal
 GDAL binary built with GEOS: FALSE 
 Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
 Path to PROJ.4 shared files: /Users/jayneteo/Library/R/3.6/library/rgdal/proj
 Linking to sp version: 1.3-2 
Loading required package: GISTools
Loading required package: maptools
Checking rgeos availability: TRUE
Loading required package: RColorBrewer
Loading required package: MASS

Attaching package: ‘MASS’

The following objects are masked from ‘package:raster’:

    area, select

The following object is masked from ‘package:dplyr’:

    select

Loading required package: magrittr

Attaching package: ‘magrittr’

The following object is masked from ‘package:raster’:

    extract

Loading required package: BSDA
Loading required package: lattice

Attaching package: ‘BSDA’

The following object is masked from ‘package:datasets’:

    Orange

Loading required package: PASWR
Loading required package: e1071

Attaching package: ‘e1071’

The following object is masked from ‘package:raster’:

    interpolate


Attaching package: ‘PASWR’

The following objects are masked from ‘package:BSDA’:

    Chips, CIsim, Combinations, Depend, EDA, Engineer, Grades, Kinder, normarea, nsize, ntester, Phone, Rat, Salinity, SIGN.test, Soccer, SRS, tsum.test, Wool, z.test, zsum.test

Loading required package: broom
Loading required package: tidyverse
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.0     ✓ purrr   0.3.3
✓ tibble  2.1.3     ✓ stringr 1.4.0
✓ tidyr   1.0.2     ✓ forcats 0.4.0
✓ readr   1.3.1     
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x data.table::between() masks dplyr::between()
x tidyr::extract()      masks magrittr::extract(), raster::extract()
x dplyr::filter()       masks stats::filter()
x data.table::first()   masks dplyr::first()
x dplyr::lag()          masks stats::lag()
x data.table::last()    masks dplyr::last()
x MASS::select()        masks raster::select(), dplyr::select()
x purrr::set_names()    masks magrittr::set_names()
x purrr::transpose()    masks data.table::transpose()
Loading required package: gtools

Attaching package: ‘gtools’

The following object is masked from ‘package:e1071’:

    permutations
'%!in%' <- function(x,y)!('%in%'(x,y))
# Install required libraries
data <- read.csv("data/realis2018.csv")
head(data)

Problem Description

One day, your mum tells you that we just won the first prize of TOTO which worth 2,500,000 SGD. After you discuss with your family, you decide to buy a flat and plan for the investment. During the period of time, you contact the company “Property Master@” to discuss more on the historical transaction of Singapore property. The sample data givenis “realis2018.csv”. For problems stated below, we use α= 5%.

# clean data

# filter no. of units = 1 cos some transactions are the whole damn building
data %<>% filter(No..of.Units==1)

# recode YISHUN and Yishun
data$Planning.Area <- recode(data$Planning.Area,"YISHUN"="Yishun")

Problem 1

You look around few different planning areas (Column R). When you ask the agent what is the mean Unit price(psm) for Newton flats (Column G), she claims that the mean is higher than 26500. Do you agree with your agent’s suggestion? Explain and justify your answer.

Assumptions:

  • Assume “Flat” means either Condominium, Executive Condominium or Apartment
  • Data is a sample of housing prices in 2018

Code

Newton <-  data %>%  filter(Planning.Area=="Newton",Property.Type %in% c("Condominium", "Apartment", "Executive Condominium"), No..of.Units == "1")
head(Newton)
z.test(Newton$Unit.Price....psm, alternative = "greater", mu= 26500,sigma.x=sd(Newton$Unit.Price....psm))

    One-sample z-Test

data:  Newton$Unit.Price....psm
z = 1.881, p-value = 0.02998
alternative hypothesis: true mean is greater than 26500
95 percent confidence interval:
 26598.75      Inf
sample estimates:
mean of x 
 27286.51 

Detailed Analysis

At 95% Confidence level, we have sufficient evidence to say that the mean price per square meter(psm) of flats in the Newton Area is more than $26500.

Problem 2

Your friend told you that Newton planning area may not be the best area to choose. He suggested you to consider other planning areas. This is a very difficult decision since you need to conduct a more comprehensive analysis and you also need to justify whether you still choose Newton or another planning area.

Assumptions:

  • Assume no preference for planning area
  • Assume purchasing in Q1 2020
  • The budget is 2.5 million
  • Assume “Flat” means either Condominium, Executive Condominium or Apartment
  • Key factors to consider: Accesibility, Estate Maturity
  • Metric for evaluation is PSM

Code

Distribution of Properties <= 2.5 Million across Planning Areas

realis <- fread('data/realis2018.csv')
realis$pa <- toupper(realis$`Planning Area`)
centroids <- readOGR("data/MP14_PLNG_AREA_WEB_PL.shp")
OGR data source with driver: ESRI Shapefile 
Source: "/Users/jayneteo/Desktop/ASSR - Project/TakeHome/data/MP14_PLNG_AREA_WEB_PL.shp", layer: "MP14_PLNG_AREA_WEB_PL"
with 55 features
It has 12 fields
dgp <-  spTransform(centroids, CRS("+proj=longlat +ellps=GRS80"))
one_unit <- subset(realis, realis$`No. of Units` == 1 & realis$`Transacted Price ($)` <= 2500000)
pa_units <- aggregate(realis$`No. of Units`,
                      by = list(realis$pa),
                      FUN = sum)
colnames(pa_units) = c('PA', 'Units')
m <- merge(dgp,pa_units, by.x ='PLN_AREA_N', by.y = 'PA')

pal <-
  colorBin(palette = brewer.pal(10,"YlGnBu"),
           domain = c(0,2000),
           na.color = "#00000000",
           bins=c(0,5,10,50,100,200,400,600,800,1000,1200,1400,1600,1800,2000))
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
# create the base map, default will be openstreetmap if not selected 
# added centroids point as well
leaflet(dgp) %>% addTiles() %>% 
                 addPolygons(fillColor = ~pal(m$Units),
                             weight = 2,
                             opacity = 1,
                             color = "grey",
                             dashArray = "1",
                             fillOpacity = 0.8) %>% 
                 addLegend("topright", pal, values=(0:2000), 
                           title = "Transacted", 
                           labFormat = labelFormat(suffix = " Units", between = '-')) 
pa_units[order(-pa_units$Units),]

Available Flats by Planning Area

Total Stock

stock_data <- fread("data/stock2019Q4.csv")
stock_data$PA <- toupper(stock_data$PA)
stock <- merge(dgp,stock_data, by.x ='PLN_AREA_N', by.y = 'PA')

pal <-
  colorBin(palette = brewer.pal(10,"YlGnBu"),
           domain = c(0,2000),
           na.color = "#00000000",
           bins=c(0,500,1000,3000,5000,8000,10000,15000,20000,30000))
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
# create the base map, default will be openstreetmap if not selected 
# added centroids point as well
leaflet(dgp) %>% addTiles() %>% 
                 addPolygons(fillColor = ~pal(stock$Total),
                             weight = 2,
                             opacity = 1,
                             color = "grey",
                             dashArray = "1",
                             fillOpacity = 0.8) %>% 
                 addLegend("topright", pal, values=(0:2000), 
                           title = "Total Stock", 
                           labFormat = labelFormat(suffix = " Units", between = '-')) 
stock_data[order(-stock_data$Total),]

ANOVA and Tukey on PSM per Planning Area for each Flat Type


PropType <- unique(data$Property.Type)
for (k in 1:length(unique(data$Property.Type))){
  data_property <-  data %>%  filter(Property.Type==unique(data$Property.Type)[k])
    res.aov <- aov(Unit.Price....psm.~Planning.Area,data=data_property)
    summary(res.aov)
    results <-  tidy(TukeyHSD(res.aov,ordered=TRUE))
    results_sorted <-  results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
    
  
    rankings1 <-  results_sorted %>%  group_by(Bigger) %>%  summarise(Count=n()) %>%  arrange(desc(Count))
    rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>%  group_by(Smaller) %>% summarise(Count=n())%>%  arrange(Count)
    names(rankings2)[1] <- "Bigger"
    rankings <- rbind(rankings1,rankings2)

    rankings$Rank <-  seq.int(nrow(rankings)) 
    rankings %<>% dplyr::select(-Count)
    results_sorted <- left_join(results_sorted, rankings) %>%  arrange(Rank)

 
    rankings$TukeyRank <- NA
    rankings$TukeyRank[1] <- 1
    for( i in 2:nrow(rankings)){
      if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
      } else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
       } else {
         rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
       }
     }
    
    Tukey_ranked <-  rankings %>%  dplyr::select(Bigger,TukeyRank)
    names(Tukey_ranked)[1] <-  "Planning.Area"
    Planning_Mean <-  data_property %>% group_by(Planning.Area) %>%  summarise(mean= mean(Unit.Price....psm.), sd= sd(Unit.Price....psm.))
    Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
    
    assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
    assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}
Joining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vector
head(Tukey_Apartment)
head(Tukey_Condominium)
head(`Tukey_Executive Condominium`)
head(`Tukey_Detached House`)
head(`Tukey_Semi-Detached House`)
head(`Tukey_Terrace House`)

summary(res.aov)
                Df    Sum Sq   Mean Sq F value Pr(>F)    
Planning.Area   14 2.071e+09 147906034   216.6 <2e-16 ***
Residuals     1731 1.182e+09    682842                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Detailed analysis 

#We performed the ANOVA test to determine if all the mean PSM per planning areas were similar.
#At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean price per square meter(psm) of flats are significantly different
#To compare the group means, a post hoc test -  TUKEY test - was performed per property type 
#Given the extensive results output, the TUKEY results were sorted for clarity. 
#For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly 
#At 95% confidence intervals:
#   For property type - apartments - the Orchard area compared with the other planning areas, has the most significantly different differences between means. River Valley, Newton and Downtown core are ranked 2nd.
#   For property type - Condominium - the Orchard and River Vallye areas compared with the other planning areas, have the most significantly different differences between means. The Newton area comes in second
#   For property type - EC - the Bishan area compared with the other planning areas, has the most significantly different differences between means.Sengkang, Punggol areas come in second 
#   For property type - Detached House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Novena / Southern islands (Sentosa) are ranked second and third respectively
#   For property type - Semi-Detached House - the River Valley area compared with the other planning areas, has the most significantly different differences between means. The Tanglin, Marine Parade areas are ranked second
#   For property type - Terrance House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Rochor, River Valley and Novea are joint second. 

ANOVA and Tukey on Transacted Price per Planning Area for each Flat Type

# output the damn tukey table and sort
# jayne do anova



for (k in 1:length(unique(data$Property.Type))){
  data_property <-  data %>%  filter(Property.Type==unique(data$Property.Type)[k])
    res.aov <- aov(Transacted.Price....~Planning.Area,data=data_property)
    summary(res.aov)
    results <-  tidy(TukeyHSD(res.aov,ordered=TRUE))
    results_sorted <-  results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
    
  
    rankings1 <-  results_sorted %>%  group_by(Bigger) %>%  summarise(Count=n()) %>%  arrange(desc(Count))
    rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>%  group_by(Smaller) %>% summarise(Count=n())%>%  arrange(Count)
    names(rankings2)[1] <- "Bigger"
    rankings <- rbind(rankings1,rankings2)

    rankings$Rank <-  seq.int(nrow(rankings)) 
    rankings %<>% dplyr::select(-Count)
    results_sorted <- left_join(results_sorted, rankings) %>%  arrange(Rank)

 
    rankings$TukeyRank <- NA
    rankings$TukeyRank[1] <- 1
    for( i in 2:nrow(rankings)){
      if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
      } else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
       } else {
         rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
       }
     }
    
    Tukey_ranked <-  rankings %>%  dplyr::select(Bigger,TukeyRank)
    names(Tukey_ranked)[1] <-  "Planning.Area"
    Planning_Mean <-  data_property %>% group_by(Planning.Area) %>%  summarise(mean= mean(Transacted.Price....), sd= sd(Transacted.Price....))
    Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
    
    assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
    assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}
Joining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"
Joining, by = "Planning.Area"
Column `Planning.Area` joining character vector and factor, coercing into character vector
head(Tukey_Apartment)
head(Tukey_Condominium)
head(`Tukey_Executive Condominium`)
head(`Tukey_Detached House`)
head(`Tukey_Semi-Detached House`)
head(`Tukey_Terrace House`)
summary(res.aov)
                Df    Sum Sq   Mean Sq F value Pr(>F)    
Planning.Area   14 1.689e+13 1.206e+12   65.86 <2e-16 ***
Residuals     1731 3.170e+13 1.831e+10                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#Detailed analysis 

#We performed the ANOVA test to determine if the mean transacted price per planning areas were similar.
#At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean transacted price of flats are significantly different
#To compare the group means, a post hoc test -  TUKEY test - was performed for each property type 
#Given the extensive results output, the TUKEY results were sorted for clarity. 
#For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly 
#At 95% confidence intervals:
#   For property type - apartments - the Orchard area compared with the other planning areas, has the most significantly different differences between means. Newton and Downtown core are ranked 2nd.This is 
#   For property type - Condominium - the Newton and Tanglin areas compared with the other planning areas, have the most significantly different differences between means. Orchard and River Valley areas come in second
#   For property type - EC - the Bishan area compared with the other planning areas, has the most significantly different differences between means. Ang Mo Kio and Bukit Batok areas are ranked second and third respectively 
#   For property type - Detached House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Southern islands (Sentosa) / Bukit Timah / Novena / Marine Parade are ranked second and third respectively
#   For property type - Semi-Detached House - the Tanglin area compared with the other planning areas, has the most significantly different differences between means. The River Valley area are ranked second
#   For property type - Terrance House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Novena areas are joint second. 

MRT Stations(Existing and Planned) by Planning Area

mrt <- read.csv("data/Planning_area_mrt_stations.csv")
mrt$Planning.Area <- toupper(mrt$ï..Planning.Area)
Error in `$<-.data.frame`(`*tmp*`, Planning.Area, value = character(0)) : 
  replacement has 0 rows, data has 39

Detailed Analysis

---
title: "Take-Home Group Assignment"
output: html_notebook
---

```{r}
# Install required libraries
packages <- c('leaflet','dplyr','data.table','sp', 'rgeos', 'raster', 
'rgdal','GISTools','magrittr','BSDA', 'PASWR','broom','tidyverse','gtools')

for(p in packages){
  if(!require(p,character.only = T)){
    install.packages(p)
  }
  library(p,character.only = T)
}

'%!in%' <- function(x,y)!('%in%'(x,y))
```

```{r}
# Install required libraries
data <- read.csv("data/realis2018.csv")
head(data)
```

## Problem Description
One day, your mum tells you that we just won the first prize of TOTO which worth 2,500,000 SGD. After you discuss with your family, you decide to buy a flat and plan for the investment. During the period of time, you contact the company "Property Master@" to discuss more on the historical transaction of Singapore property. The sample data givenis “realis2018.csv”. For problems stated below, we use α= 5%.

```{r}
# clean data

# filter no. of units = 1 cos some transactions are the whole damn building
data %<>% filter(No..of.Units==1)

# recode YISHUN and Yishun
data$Planning.Area <- recode(data$Planning.Area,"YISHUN"="Yishun")
```

# Problem 1
You look around few different planning areas (Column R). When you ask the agent what is the mean Unit price(psm) for Newton flats (Column G), she claims that the mean is higher than 26500. Do you agree with your agent’s suggestion? Explain and justify your answer.

## Assumptions:
* Assume "Flat" means either Condominium, Executive Condominium or Apartment
* Data is a sample of housing prices in 2018

## Code
```{r}
Newton <-  data %>%  filter(Planning.Area=="Newton",Property.Type %in% c("Condominium", "Apartment", "Executive Condominium"), No..of.Units == "1")
head(Newton)
```

```{r}
z.test(Newton$Unit.Price....psm, alternative = "greater", mu= 26500,sigma.x=sd(Newton$Unit.Price....psm))
```

## Detailed Analysis
At 95% Confidence level, we have sufficient evidence to say that the mean price per square meter(psm) of flats in the Newton Area is more than $26500.


# Problem 2
Your friend told you that Newton planning area may not be the best area to choose. He suggested you to consider other planning areas. This is a very difficult decision since you need to conduct a more comprehensive analysis and you also need to justify whether you still choose Newton or another planning area.

## Assumptions:
* Assume no preference for planning area
* Assume purchasing in Q1 2020
* The budget is 2.5 million
* Assume "Flat" means either Condominium, Executive Condominium or Apartment
* Key factors to consider: Accesibility, Estate Maturity
* Metric for evaluation is PSM

## Code
### Distribution of Properties <= 2.5 Million across Planning Areas
```{r}
realis <- fread('data/realis2018.csv')
realis$pa <- toupper(realis$`Planning Area`)
centroids <- readOGR("data/MP14_PLNG_AREA_WEB_PL.shp")
dgp <-  spTransform(centroids, CRS("+proj=longlat +ellps=GRS80"))
one_unit <- subset(realis, realis$`No. of Units` == 1 & realis$`Transacted Price ($)` <= 2500000)
pa_units <- aggregate(realis$`No. of Units`,
                      by = list(realis$pa),
                      FUN = sum)
colnames(pa_units) = c('PA', 'Units')
m <- merge(dgp,pa_units, by.x ='PLN_AREA_N', by.y = 'PA')

pal <-
  colorBin(palette = brewer.pal(10,"YlGnBu"),
           domain = c(0,2000),
           na.color = "#00000000",
           bins=c(0,5,10,50,100,200,400,600,800,1000,1200,1400,1600,1800,2000))
# create the base map, default will be openstreetmap if not selected 
# added centroids point as well
leaflet(dgp) %>% addTiles() %>% 
                 addPolygons(fillColor = ~pal(m$Units),
                             weight = 2,
                             opacity = 1,
                             color = "grey",
                             dashArray = "1",
                             fillOpacity = 0.8) %>% 
                 addLegend("topright", pal, values=(0:2000), 
                           title = "Transacted", 
                           labFormat = labelFormat(suffix = " Units", between = '-')) 
```

```{r}
pa_units[order(-pa_units$Units),]
```

### Available Flats by Planning Area
#### Total Stock
```{r}
stock_data <- fread("data/stock2019Q4.csv")
stock_data$PA <- toupper(stock_data$PA)
stock <- merge(dgp,stock_data, by.x ='PLN_AREA_N', by.y = 'PA')

pal <-
  colorBin(palette = brewer.pal(10,"YlGnBu"),
           domain = c(0,2000),
           na.color = "#00000000",
           bins=c(0,500,1000,3000,5000,8000,10000,15000,20000,30000))
# create the base map, default will be openstreetmap if not selected 
# added centroids point as well
leaflet(dgp) %>% addTiles() %>% 
                 addPolygons(fillColor = ~pal(stock$Total),
                             weight = 2,
                             opacity = 1,
                             color = "grey",
                             dashArray = "1",
                             fillOpacity = 0.8) %>% 
                 addLegend("topright", pal, values=(0:2000), 
                           title = "Total Stock", 
                           labFormat = labelFormat(suffix = " Units", between = '-')) 
```

```{r}
stock_data[order(-stock_data$Total),]
```

### ANOVA and Tukey on PSM per Planning Area for each Flat Type
```{r}

PropType <- unique(data$Property.Type)
for (k in 1:length(unique(data$Property.Type))){
  data_property <-  data %>%  filter(Property.Type==unique(data$Property.Type)[k])
    res.aov <- aov(Unit.Price....psm.~Planning.Area,data=data_property)
    summary(res.aov)
    results <-  tidy(TukeyHSD(res.aov,ordered=TRUE))
    results_sorted <-  results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
    
  
    rankings1 <-  results_sorted %>%  group_by(Bigger) %>%  summarise(Count=n()) %>%  arrange(desc(Count))
    rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>%  group_by(Smaller) %>% summarise(Count=n())%>%  arrange(Count)
    names(rankings2)[1] <- "Bigger"
    rankings <- rbind(rankings1,rankings2)

    rankings$Rank <-  seq.int(nrow(rankings)) 
    rankings %<>% dplyr::select(-Count)
    results_sorted <- left_join(results_sorted, rankings) %>%  arrange(Rank)

 
    rankings$TukeyRank <- NA
    rankings$TukeyRank[1] <- 1
    for( i in 2:nrow(rankings)){
      if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
      } else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
       } else {
         rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
       }
     }
    
    Tukey_ranked <-  rankings %>%  dplyr::select(Bigger,TukeyRank)
    names(Tukey_ranked)[1] <-  "Planning.Area"
    Planning_Mean <-  data_property %>% group_by(Planning.Area) %>%  summarise(mean= mean(Unit.Price....psm.), sd= sd(Unit.Price....psm.))
    Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
    
    assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
    assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}

head(Tukey_Apartment)
head(Tukey_Condominium)
head(`Tukey_Executive Condominium`)
head(`Tukey_Detached House`)
head(`Tukey_Semi-Detached House`)
head(`Tukey_Terrace House`)

summary(res.aov)
```
```{r}
#Detailed analysis 

#We performed the ANOVA test to determine if all the mean PSM per planning areas were similar.
#At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean price per square meter(psm) of flats are significantly different
#To compare the group means, a post hoc test -  TUKEY test - was performed per property type 
#Given the extensive results output, the TUKEY results were sorted for clarity. 
#For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly 
#At 95% confidence intervals:
#   For property type - apartments - the Orchard area compared with the other planning areas, has the most significantly different differences between means. River Valley, Newton and Downtown core are ranked 2nd.
#   For property type - Condominium - the Orchard and River Vallye areas compared with the other planning areas, have the most significantly different differences between means. The Newton area comes in second
#   For property type - EC - the Bishan area compared with the other planning areas, has the most significantly different differences between means.Sengkang, Punggol areas come in second 
#   For property type - Detached House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Novena / Southern islands (Sentosa) are ranked second and third respectively
#   For property type - Semi-Detached House - the River Valley area compared with the other planning areas, has the most significantly different differences between means. The Tanglin, Marine Parade areas are ranked second
#   For property type - Terrance House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Rochor, River Valley and Novea are joint second. 




```

### ANOVA and Tukey on Transacted Price per Planning Area for each Flat Type
```{r}
# output the damn tukey table and sort
# jayne do anova



for (k in 1:length(unique(data$Property.Type))){
  data_property <-  data %>%  filter(Property.Type==unique(data$Property.Type)[k])
    res.aov <- aov(Transacted.Price....~Planning.Area,data=data_property)
    summary(res.aov)
    results <-  tidy(TukeyHSD(res.aov,ordered=TRUE))
    results_sorted <-  results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
    
  
    rankings1 <-  results_sorted %>%  group_by(Bigger) %>%  summarise(Count=n()) %>%  arrange(desc(Count))
    rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>%  group_by(Smaller) %>% summarise(Count=n())%>%  arrange(Count)
    names(rankings2)[1] <- "Bigger"
    rankings <- rbind(rankings1,rankings2)

    rankings$Rank <-  seq.int(nrow(rankings)) 
    rankings %<>% dplyr::select(-Count)
    results_sorted <- left_join(results_sorted, rankings) %>%  arrange(Rank)

 
    rankings$TukeyRank <- NA
    rankings$TukeyRank[1] <- 1
    for( i in 2:nrow(rankings)){
      if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
      } else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
        rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
       } else {
         rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
       }
     }
    
    Tukey_ranked <-  rankings %>%  dplyr::select(Bigger,TukeyRank)
    names(Tukey_ranked)[1] <-  "Planning.Area"
    Planning_Mean <-  data_property %>% group_by(Planning.Area) %>%  summarise(mean= mean(Transacted.Price....), sd= sd(Transacted.Price....))
    Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
    
    assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
    assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}

head(Tukey_Apartment)
head(Tukey_Condominium)
head(`Tukey_Executive Condominium`)
head(`Tukey_Detached House`)
head(`Tukey_Semi-Detached House`)
head(`Tukey_Terrace House`)
summary(res.aov)

```
```{r}
#Detailed analysis 

#We performed the ANOVA test to determine if the mean transacted price per planning areas were similar.
#At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean transacted price of flats are significantly different
#To compare the group means, a post hoc test -  TUKEY test - was performed for each property type 
#Given the extensive results output, the TUKEY results were sorted for clarity. 
#For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly 
#At 95% confidence intervals:
#   For property type - apartments - the Orchard area compared with the other planning areas, has the most significantly different differences between means. Newton and Downtown core are ranked 2nd.This is 
#   For property type - Condominium - the Newton and Tanglin areas compared with the other planning areas, have the most significantly different differences between means. Orchard and River Valley areas come in second
#   For property type - EC - the Bishan area compared with the other planning areas, has the most significantly different differences between means. Ang Mo Kio and Bukit Batok areas are ranked second and third respectively 
#   For property type - Detached House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Southern islands (Sentosa) / Bukit Timah / Novena / Marine Parade are ranked second and third respectively
#   For property type - Semi-Detached House - the Tanglin area compared with the other planning areas, has the most significantly different differences between means. The River Valley area are ranked second
#   For property type - Terrance House - the Newton area compared with the other planning areas, has the most significantly different differences between means. Tanglin and Novena areas are joint second. 



```

### MRT Stations(Existing and Planned) by Planning Area
```{r}
mrt <- read.csv("data/Planning_area_mrt_stations.csv")
mrt$Planning.Area <- toupper(mrt$ï..Planning.Area)
dgp <-  centroids <- readOGR("data/MP14_PLNG_AREA_WEB_PL.shp")
dgp <-  spTransform(dgp, CRS("+proj=longlat +ellps=GRS80"))
mrt2 <- merge(dgp,mrt, by.x ='PLN_AREA_N', by.y = 'Planning.Area')

#All MRT Stations
mrt_pal <- colorFactor(palette= brewer.pal(15, 'RdYlGn'),
                        domain=c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14),
                        na.color = "#00000000")

 
leaflet(dgp) %>% addTiles() %>% addPolygons(fillColor = ~mrt_pal(mrt2$Total.no..of.stations),
                                            weight = 2,
                                            opacity = 1,
                                            color = "grey",
                                            dashArray = "1",
                                            fillOpacity = 0.8) %>% addLegend("topright", mrt_pal, values=(0:14), title= "MRT Stations", labFormat = 
                                                                               labelFormat(suffix = " stations"))

#Only current MRT stations
mrt_pal2 <- colorFactor(palette= brewer.pal(12, 'RdYlGn'),
                        domain=c(0,1,2,3,4,5,6,7,8,9,10,11),
                        na.color = "#00000000")
leaflet(dgp) %>% addTiles() %>% addPolygons(fillColor = ~mrt_pal2(mrt2$No..of.operational.stations),
                                            weight = 2,
                                            opacity = 1,
                                            color = "grey",
                                            dashArray = "1",
                                            fillOpacity = 0.8) %>% addLegend("topright", mrt_pal2, values=(0:11), title= "MRT Stations", labFormat = 
                                                                               labelFormat(suffix = " stations"))

```

## Detailed Analysis

